#Loading packages
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from IPython.display import HTML
#Read the world population CSV into a DataFrame
DATA_PATH = "C:/Users/Nathan/Documents/Portfolio/Portfolio Data/World Population EDA/world_population.csv"
df = pd.read_csv(DATA_PATH)
#Report the size of the data as (number of rows, number of columns)
df.shape
(234, 17)
#Preview the first five rows of the data
df.head(n=5)
| Rank | CCA3 | Country | Capital | Continent | 2022 Population | 2020 Population | 2015 Population | 2010 Population | 2000 Population | 1990 Population | 1980 Population | 1970 Population | Area (km²) | Density (per km²) | Growth Rate | World Population Percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36 | AFG | Afghanistan | Kabul | Asia | 41128771 | 38972230 | 33753499 | 28189672 | 19542982 | 10694796 | 12486631 | 10752971 | 652230 | 63.0587 | 1.0257 | 0.52 |
| 1 | 138 | ALB | Albania | Tirana | Europe | 2842321 | 2866849 | 2882481 | 2913399 | 3182021 | 3295066 | 2941651 | 2324731 | 28748 | 98.8702 | 0.9957 | 0.04 |
| 2 | 34 | DZA | Algeria | Algiers | Africa | 44903225 | 43451666 | 39543154 | 35856344 | 30774621 | 25518074 | 18739378 | 13795915 | 2381741 | 18.8531 | 1.0164 | 0.56 |
| 3 | 213 | ASM | American Samoa | Pago Pago | Oceania | 44273 | 46189 | 51368 | 54849 | 58230 | 47818 | 32886 | 27075 | 199 | 222.4774 | 0.9831 | 0.00 |
| 4 | 203 | AND | Andorra | Andorra la Vella | Europe | 79824 | 77700 | 71746 | 71519 | 66097 | 53569 | 35611 | 19860 | 468 | 170.5641 | 1.0100 | 0.00 |
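The dataset contains population information on 234 countries and territories, described by 17 variables: Rank, CCA3, Country, Capital, Continent, eight population snapshots (2022, 2020, 2015, 2010, 2000, 1990, 1980 and 1970), Area (km²), Density (per km²), Growth Rate and World Population Percentage.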
We can start by checking that every column is complete, i.e. contains no missing values. This can be done by counting the non-missing entries in each column; a quick and easy way to inspect the result is to plot it as a bar chart.
#Count the non-missing entries in every column
#(despite its name, `missing` holds the number of values that are present)
missing = df.notna().sum()
#Bar chart with one bar per variable, labelled with its count of non-missing values
fig = px.bar(
    x=missing.index,
    y=missing.values,
    text=missing.values,
    title='World Population Dataset: Total Number of Data Points (out of 234 rows)',
)
#Name the axes and make the hover text self-explanatory
fig.update_layout(xaxis_title='Dataset Variables', yaxis_title='Number of Instances')
fig.update_traces(hovertemplate='<br> Variable: %{x} </br> Number of Instances: %{y}')
fig.show()
Since every column contains 234 non-missing values, one for each row, we can conclude that there are no missing values in the dataset.
To better understand the dataset, we may wish to know how many countries/territories are being accounted for in each continent
#Tally how many countries/territories are listed under each continent
countries_by_continent = df.groupby('Continent')['Country']
country_counts = countries_by_continent.count()
#Generate a bar chart with one bar per continent
continents = country_counts.index
fig = px.bar(
    x=continents,
    y=country_counts.values,
    color=continents,  #one colour per continent so the bars are easy to tell apart
    text=country_counts.values,  #print the count on each bar
    color_discrete_sequence=px.colors.sequential.Peach[::-1],
    title='World Population Dataset: Countries per Continent',
)
#Name the axes and reword the hover text for clarity
fig.update_layout(xaxis_title='Continents', yaxis_title='Number of Countries')
fig.update_traces(hovertemplate='<br> Continent: %{x} </br> Number of Countries Included: %{y}')
fig.show()
The data contains two variables which have a clear link to one another as well as population size: Area (km²) and Density (per km²)
#Map styling: latitude/longitude grid lines plus light-blue oceans and lakes
geo_options = dict(
    lataxis_showgrid=True,
    lonaxis_showgrid=True,
    showocean=True,
    oceancolor="LightBlue",
    showlakes=True,
    lakecolor="LightBlue",
)
#World map shaded by each country's area in km², matched to the map by country name
fig = px.choropleth(
    df,
    locations='Country',
    locationmode='country names',
    color='Area (km²)',
    color_continuous_scale=px.colors.sequential.Bluyl,
    title='Total Land Mass by Country',
    template='ggplot2',
    projection='natural earth',
)
fig.update_geos(**geo_options)
fig.show()
For a direct comparison, it helps to view the population density in a similar map to see whether any obvious relationship exists.
#Map styling: latitude/longitude grid lines plus light-blue oceans and lakes
geo_options = dict(
    lataxis_showgrid=True,
    lonaxis_showgrid=True,
    showocean=True,
    oceancolor="LightBlue",
    showlakes=True,
    lakecolor="LightBlue",
)
#World map shaded by population density (per km²)
#NOTE(review): the colour scale is capped at 150, presumably so a few very dense
#territories do not wash out the rest of the map - confirm the intended cut-off
fig = px.choropleth(
    df,
    locations='Country',
    locationmode='country names',
    color='Density (per km²)',
    range_color=(0,150),
    color_continuous_scale=px.colors.sequential.OrRd,
    title='Total Population Density by Country',
    template='ggplot2',
    projection='natural earth',
)
fig.update_geos(**geo_options)
#Keep a second reference to this figure under the name `example`
example = fig
fig.show()
Let's begin looking at the current world population numbers and proportions by first inspecting each continent's contribution:
#Average 2022 population per country/territory within each continent,
#rounded to a whole number and ordered from largest to smallest
continent_populations = (
    df.groupby('Continent')['2022 Population']
    .mean()
    .round()
    .sort_values(ascending=False)
)
#Bar chart of the averages; bar colour follows the average itself
fig = px.bar(
    x=continent_populations.index,
    y=continent_populations.values,
    color=continent_populations,
    text=continent_populations.values,  #print the average on each bar
    color_continuous_scale=px.colors.sequential.Darkmint,
    title='Average 2022 Population by Continent',
)
#Reword the hover text, then name the axes
fig.update_traces(hovertemplate='<br> Continent: %{x} </br> Average Population: %{y}')
fig.update_layout(xaxis_title='Continents', yaxis_title='Average Population Count')
fig.show()
#Donut chart of each continent's share of the total 2022 world population
pie_options = dict(
    names='Continent',
    values='2022 Population',
    hole=0.25,
    color_discrete_sequence=px.colors.sequential.Magenta[::-1],
    title='Total 2022 Population by Continent',
)
fig = px.pie(df, **pie_options)
#Show the continent name, its percentage and its absolute population on every slice
fig.update_traces(textinfo='label+percent+value')
fig.show()
To get a better idea of how the average and total populations for each continent got to the current levels, we can inspect the changes over the past 50 years in similar plots:
#Population columns for each decade of the past 50 years, in chronological order
populations = ['1970 Population','1980 Population','1990 Population','2000 Population','2010 Population','2020 Population']
#For every decade, draw a bar chart of the average population per country/territory
#next to a pie chart of the total population, both broken down by continent
for pop in populations:
    #Average population per country/territory in each continent for this decade.
    #This must be computed from the decade's own column: reusing the 2022
    #averages would draw the same bars under every decade's title.
    decade_averages = df.groupby('Continent')[pop].mean().round().sort_values(ascending=False)
    #Total population of each continent for this decade, largest first
    decade_totals = df.groupby('Continent')[pop].sum().sort_values(ascending=False)
    #One row, two columns: a cartesian panel for the bars and a domain panel for the pie
    fig = make_subplots(rows=1, cols=2,
                        specs=[[{'type':'xy'},{'type':'domain'}]],
                        subplot_titles=['Average '+pop,'Total '+pop],
                        horizontal_spacing=0.2, column_widths=[0.7,0.3],
                        y_title='Per Country/Territory')
    #Bar chart of the average population per country/territory by continent
    fig.add_trace(
        go.Bar(x=decade_averages.index,
               y=decade_averages.values,
               showlegend=False,
               name='',
               marker=dict(color=px.colors.sequential.Darkmint[::-1]),
               texttemplate='%{y}'
               ),
        row=1,
        col=1
    )
    #Pie chart of the total population per continent, labelled with name, percentage and amount
    fig.add_trace(
        go.Pie(values=decade_totals,
               labels=decade_totals.index,
               showlegend=False,
               name='',
               marker=dict(colors=px.colors.sequential.Magenta[::-1]),
               texttemplate='<br>%{label}</br>%{percent}</br>%{value}',
               textposition='outside',
               hole=0.25
               ),
        row=1,
        col=2
    )
    #Main title naming the decade the two panels refer to
    fig.update_layout(title_text=pop+' Demographics by Continent')
    fig.show()
After looking at each continent as a whole, we can identify which countries and territories have contributed the most and least to the current world population totals:
#2022 population per country/territory
country_totals_2022 = df.groupby('Country')['2022 Population'].sum()
#Keep the five largest, ordered from most to least populated
mostpop_country = country_totals_2022.sort_values(ascending=False).head(5)
#Bar chart of the five most populated countries of 2022, with the population printed on each bar
fig = px.bar(
    x=mostpop_country.index,
    y=mostpop_country.values,
    text=mostpop_country.values,
    color_discrete_sequence=['MidnightBlue'],
    title='Top 5 Most Populated Countries According to 2022 Population',
)
#Reword the hover text and name the axes for clarity
fig.update_traces(hovertemplate='<br> Country: %{x} </br> 2022 Population: %{y}')
fig.update_layout(xaxis_title='Country', yaxis_title='Population')
fig.show()
#Find the five least populated countries/territories of 2022
leastpop_country = df.groupby('Country')['2022 Population'].sum().sort_values(ascending=True).head(5)
#Re-order those five from largest to smallest so the bars descend left to right
leastpop_country = leastpop_country.sort_values(ascending=False)
#Generate a bar plot of the five least populated countries/territories of 2022
fig = px.bar(x=leastpop_country.index,
             y=leastpop_country.values,
             color_discrete_sequence=['IndianRed'],
             text=leastpop_country.values,  #print the population on each bar
             title='Top 5 Least Populated Countries According to 2022 Population'
             )
#Update axis titles for clarity
fig.update_layout(xaxis_title='Country', yaxis_title='Population')
#Update hover information for additional clarity
fig.update_traces(hovertemplate='<br> Country: %{x} </br> 2022 Population: %{y}')
fig.show()
We can try to determine if there are any changes in these rankings by looking at the most and least populated countries and territories of the past 50 years:
#Population column names for 1970, 1980, ..., 2020 in chronological order
populations = [f'{year} Population' for year in range(1970, 2021, 10)]
#For each decade, plot the five most and the five least populated countries/territories side by side
for pop in populations:
    #Population per country/territory for this decade
    population_by_country = df.groupby('Country')[pop].sum()
    #Five largest, ordered from most to least populated
    mostpop_country = population_by_country.sort_values(ascending=False).head(5)
    #Five smallest, re-ordered so that they also descend from left to right
    leastpop_country = population_by_country.sort_values(ascending=True).head(5)
    leastpop_country = leastpop_country.sort_values(ascending=False)
    #One row with two cartesian panels sharing common axis titles
    fig = make_subplots(
        rows=1,
        cols=2,
        specs=[[{'type':'xy'},{'type':'xy'}]],
        subplot_titles=['Top 5 Most Populated Countries','Top 5 Least Populated Countries'],
        y_title='Population',
        x_title='Countries',
    )
    #(ranking, bar colour, subplot column): most populated on the left, least populated on the right
    panels = (
        (mostpop_country, 'MidnightBlue', 1),
        (leastpop_country, 'IndianRed', 2),
    )
    for ranking, bar_color, column in panels:
        fig.add_trace(
            go.Bar(
                x=ranking.index,
                y=ranking.values,
                texttemplate='%{y}',
                name='',
                showlegend=False,
                marker=dict(color=bar_color),
            ),
            row=1,
            col=column,
        )
    #Main title naming the decade the subplots refer to
    fig.update_layout(title_text=pop)
    fig.show()
Since the data contains information on the world population broken down by each country and territory's contribution, it may be easier to visualize each country's impact by viewing the information on a choropleth map. This makes it possible to view the aggregated population data geographically.
Let's start by looking at a choropleth map of the current 2022 world population:
#World map shaded by each country's/territory's 2022 population, matched to the map by country name
fig = px.choropleth(data_frame=df,
                    locations='Country',
                    locationmode='country names',
                    color='2022 Population',
                    color_continuous_scale=px.colors.sequential.RdBu[::-1],
                    range_color=(0,1000000000),  #cap the colour scale at one billion
                    title='World Map Indicating 2022 Populations',
                    template='ggplot2',
                    projection='natural earth')
#Include longitude, latitude, and water markings for clarity and visual appeal
fig.update_geos(lataxis_showgrid=True,
                lonaxis_showgrid=True,
                showocean=True, oceancolor="LightBlue",
                showlakes=True, lakecolor="LightBlue")
#Render explicitly, like the other plots, so the map also appears when this
#is not the last expression of a notebook cell
fig.show()
We can also view a choropleth map of the world population over the past 50 years and see how the population numbers change over the decades through an animation.
#Population columns for each decade from 1970 to 2020, ordered from oldest to most recent
years = ['1970 Population','1980 Population','1990 Population','2000 Population','2010 Population','2020 Population']
#Reshape from wide to long: one row per (country, decade), with the source column name
#in 'Year' and its value in 'Population'. melt returns a new frame containing only
#'Country' and the listed columns, so df is left untouched and nothing needs dropping first.
country_pop_year = pd.melt(df,
                           id_vars=['Country'],
                           value_vars=years,
                           var_name='Year',
                           value_name='Population')
#Remove ' Population' from the identifier column so it only holds the year (1970, 1980, etc.);
#regex=False treats the text literally
country_pop_year['Year'] = country_pop_year['Year'].str.replace(' Population', '', regex=False)
#Generate a choropleth animation showing the change in country/territory populations for each decade from 1970-2020
fig = px.choropleth(data_frame=country_pop_year,
                    locations='Country',  #Column used to identify countries
                    locationmode='country names',
                    color='Population',  #Column used to identify color intensity
                    animation_frame='Year',  #Column used to identify each animation frame
                    color_continuous_scale=px.colors.sequential.RdBu[::-1],
                    range_color=(0,1000000000),  #set limit on colorscale, India and China are so large the rest seem unchanged
                    title='World Map Indicating Populations: 1970 - 2020',
                    template='ggplot2',
                    projection='natural earth')  #use the Natural Earth map projection
#Include longitude, latitude, and water markings for clarity and visual appeal
fig.update_geos(lataxis_showgrid=True,
                lonaxis_showgrid=True,
                showocean=True, oceancolor="LightBlue",
                showlakes=True, lakecolor="LightBlue")
#Render explicitly, like the other plots, so the animation also appears when this
#is not the last expression of a notebook cell
fig.show()
Since more than a third of the current world population resides in China and India, it makes sense to isolate Asia and take a closer look at how its populations have changed over the past 50 years.
#Animated choropleth of population per decade, with the map scope restricted to Asia.
#Reads the long-format country_pop_year frame (columns 'Country', 'Year', 'Population').
fig = px.choropleth(data_frame=country_pop_year,
                    locations='Country',
                    locationmode='country names',
                    color='Population',  #colour encodes population counts, not density
                    animation_frame='Year',
                    color_continuous_scale=px.colors.sequential.RdBu[::-1],
                    range_color=(0,1000000000),  #cap the colour scale at one billion
                    title='Asia Population Map: 1970 - 2020',
                    template='ggplot2',
                    scope='asia')
#Include longitude, latitude, and water markings for clarity and visual appeal
fig.update_geos(lataxis_showgrid=True,
                lonaxis_showgrid=True,
                showocean=True, oceancolor="LightBlue",
                showlakes=True, lakecolor="LightBlue")
fig.show()